{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "47f1964a",
   "metadata": {},
   "outputs": [],
   "source": [
    "#第3章/加载数据集\n",
    "from datasets import load_dataset\n",
    "\n",
    "# dataset = load_dataset(path='seamew/ChnSentiCorp')\n",
    "\n",
    "# dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "a248b870",
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "#第3章/加载glue数据集\n",
    "# load_dataset(path='glue', name='sst2', split='train')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "685b590b",
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "#第3章/保存数据集到磁盘\n",
    "# dataset.save_to_disk(\n",
    "#     dataset_dict_path='./data/ChnSentiCorp')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "fcf7c912",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    train: Dataset({\n",
       "        features: ['text', 'label'],\n",
       "        num_rows: 9600\n",
       "    })\n",
       "    validation: Dataset({\n",
       "        features: ['text', 'label'],\n",
       "        num_rows: 0\n",
       "    })\n",
       "    test: Dataset({\n",
       "        features: ['text', 'label'],\n",
       "        num_rows: 1200\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#第3章/从磁盘加载数据集\n",
    "from datasets import load_from_disk\n",
    "\n",
    "dataset = load_from_disk('./data/ChnSentiCorp')\n",
    "\n",
    "dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "a0f376c6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['text', 'label'],\n",
       "    num_rows: 9600\n",
       "})"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#第3章/使用train数据子集做后续的实验\n",
    "dataset = dataset['train']\n",
    "\n",
    "dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "ad4a4acb",
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'text': '轻便，方便携带，性能也不错，能满足平时的工作需要，对出差人员来说非常不错', 'label': 1}\n",
      "{'text': '很好的地理位置，一蹋糊涂的服务，萧条的酒店。', 'label': 0}\n",
      "{'text': '非常不错，服务很好，位于市中心区，交通方便，不过价格也高！', 'label': 1}\n",
      "{'text': '跟住招待所没什么太大区别。 绝对不会再住第2次的酒店！', 'label': 0}\n",
      "{'text': '价格太高，性价比不够好。我觉得今后还是去其他酒店比较好。', 'label': 0}\n"
     ]
    }
   ],
   "source": [
    "#第3章/查看数据样例\n",
    "for i in [12, 17, 20, 26, 56]:\n",
    "    print(dataset[i])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "673595a4",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading cached sorted indices for dataset at data/ChnSentiCorp/train/cache-42a7d570466d6993.arrow\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1, 1, 0, 0, 1, 0, 0, 0, 1, 1]\n",
      "[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n",
      "[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\n"
     ]
    }
   ],
   "source": [
    "#第3章/排序数据\n",
    "#数据中的label是无序的\n",
    "print(dataset['label'][:10])\n",
    "\n",
    "#让数据按照label排序\n",
    "sorted_dataset = dataset.sort('label')\n",
    "print(sorted_dataset['label'][:10])\n",
    "print(sorted_dataset['label'][-10:])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "5611e7c7",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading cached shuffled indices for dataset at data/ChnSentiCorp/train/cache-c5b262546ff026fd.arrow\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[0, 1, 0, 0, 1, 0, 1, 0, 1, 0]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#第3章/打乱数据顺序\n",
    "shuffled_dataset = sorted_dataset.shuffle(seed=42)\n",
    "\n",
    "shuffled_dataset['label'][:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "efab17ec",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['text', 'label'],\n",
       "    num_rows: 6\n",
       "})"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#第3章/从数据集中选择某些数据\n",
    "dataset.select([0, 10, 20, 30, 40, 50])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "ab28ca3f",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading cached processed dataset at data/ChnSentiCorp/train/cache-73dc6670e81622db.arrow\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['text', 'label'],\n",
       "    num_rows: 13\n",
       "})"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#第3章/过滤数据\n",
    "def f(data):\n",
    "    return data['text'].startswith('非常不错')\n",
    "\n",
    "\n",
    "dataset.filter(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "fcdd8429",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    train: Dataset({\n",
       "        features: ['text', 'label'],\n",
       "        num_rows: 8640\n",
       "    })\n",
       "    test: Dataset({\n",
       "        features: ['text', 'label'],\n",
       "        num_rows: 960\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#第3章/切分训练集和测试集\n",
    "dataset.train_test_split(test_size=0.1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "57f49494",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['text', 'label'],\n",
       "    num_rows: 2400\n",
       "})"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#第3章/数据分桶\n",
    "dataset.shard(num_shards=4, index=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "f91fc66a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['text_rename', 'label'],\n",
       "    num_rows: 9600\n",
       "})"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#第3章/字段重命名\n",
    "dataset.rename_column('text', 'text_rename')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "5a391c7d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['label'],\n",
       "    num_rows: 9600\n",
       "})"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#第3章/删除字段\n",
    "dataset.remove_columns(['text'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "c7c26c80",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading cached processed dataset at data/ChnSentiCorp/train/cache-cb2b4292b2d24aab.arrow\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "非常不错，服务很好，位于市中心区，交通方便，不过价格也高！\n",
      "My sentence: 非常不错，服务很好，位于市中心区，交通方便，不过价格也高！\n"
     ]
    }
   ],
   "source": [
    "#第3章/应用函数\n",
    "def f(data):\n",
    "    data['text'] = 'My sentence: ' + data['text']\n",
    "    return data\n",
    "\n",
    "\n",
    "maped_datatset = dataset.map(f)\n",
    "\n",
    "print(dataset['text'][20])\n",
    "print(maped_datatset['text'][20])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "f9658d55",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " "
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading cached processed dataset at data/ChnSentiCorp/train/cache-1424dcebf8a7e96b.arrow\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " "
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading cached processed dataset at data/ChnSentiCorp/train/cache-c84a2f5f53769eed.arrow\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " "
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading cached processed dataset at data/ChnSentiCorp/train/cache-343574186c33781b.arrow\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " "
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading cached processed dataset at data/ChnSentiCorp/train/cache-c238ea5b6eec19f0.arrow\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "非常不错，服务很好，位于市中心区，交通方便，不过价格也高！\n",
      "My sentence: 非常不错，服务很好，位于市中心区，交通方便，不过价格也高！\n"
     ]
    }
   ],
   "source": [
    "#第3章/使用批处理加速\n",
    "def f(data):\n",
    "    text = data['text']\n",
    "    text = ['My sentence: ' + i for i in text]\n",
    "    data['text'] = text\n",
    "    return data\n",
    "\n",
    "\n",
    "maped_datatset = dataset.map(function=f,\n",
    "                             batched=True,\n",
    "                             batch_size=1000,\n",
    "                             num_proc=4)\n",
    "\n",
    "print(dataset['text'][20])\n",
    "print(maped_datatset['text'][20])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "a9530160",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'label': tensor(1), 'text': '非常不错，服务很好，位于市中心区，交通方便，不过价格也高！'}"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#第3章/设置数据格式\n",
    "dataset.set_format(type='torch', columns=['label'], output_all_columns=True)\n",
    "\n",
    "dataset[20]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "63ccf9d9",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using custom data configuration default\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloading and preparing dataset chn_senti_corp/default to /root/.cache/huggingface/datasets/seamew___chn_senti_corp/default/0.0.0/1f242195a37831906957a11a2985a4329167e60657c07dc95ebe266c03fdfb85...\n"
     ]
    },
    {
     "ename": "ConnectionError",
     "evalue": "Couldn't reach https://drive.google.com/u/0/uc?id=1uV-aDQoMI51A27OxVgJnzxqZFQqkDydZ&export=download (ConnectionError(MaxRetryError(\"HTTPSConnectionPool(host='drive.google.com', port=443): Max retries exceeded with url: /u/0/uc?id=1uV-aDQoMI51A27OxVgJnzxqZFQqkDydZ&export=download (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f95b7a09cc0>: Failed to establish a new connection: [Errno 101] Network is unreachable',))\",),))",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mConnectionError\u001b[0m                           Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-18-4cbaaea39b42>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0;31m#第3章/导出为csv格式\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mdataset\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mload_dataset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'seamew/ChnSentiCorp'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msplit\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'train'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      3\u001b[0m \u001b[0mdataset\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath_or_buf\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'./data/ChnSentiCorp.csv'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[0;31m#加载csv格式数据\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/envs/cpu/lib/python3.6/site-packages/datasets/load.py\u001b[0m in \u001b[0;36mload_dataset\u001b[0;34m(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, ignore_verifications, keep_in_memory, save_infos, revision, use_auth_token, task, streaming, **config_kwargs)\u001b[0m\n\u001b[1;32m   1682\u001b[0m         \u001b[0mignore_verifications\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mignore_verifications\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1683\u001b[0m         \u001b[0mtry_from_hf_gcs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtry_from_hf_gcs\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1684\u001b[0;31m         \u001b[0muse_auth_token\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0muse_auth_token\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1685\u001b[0m     )\n\u001b[1;32m   1686\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/envs/cpu/lib/python3.6/site-packages/datasets/builder.py\u001b[0m in \u001b[0;36mdownload_and_prepare\u001b[0;34m(self, download_config, download_mode, ignore_verifications, try_from_hf_gcs, dl_manager, base_path, use_auth_token, **download_and_prepare_kwargs)\u001b[0m\n\u001b[1;32m    703\u001b[0m                     \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mdownloaded_from_gcs\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    704\u001b[0m                         self._download_and_prepare(\n\u001b[0;32m--> 705\u001b[0;31m                             \u001b[0mdl_manager\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdl_manager\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mverify_infos\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mverify_infos\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mdownload_and_prepare_kwargs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    706\u001b[0m                         )\n\u001b[1;32m    707\u001b[0m                     \u001b[0;31m# Sync info\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/envs/cpu/lib/python3.6/site-packages/datasets/builder.py\u001b[0m in \u001b[0;36m_download_and_prepare\u001b[0;34m(self, dl_manager, verify_infos)\u001b[0m\n\u001b[1;32m   1219\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1220\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m_download_and_prepare\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdl_manager\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mverify_infos\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1221\u001b[0;31m         \u001b[0msuper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_download_and_prepare\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdl_manager\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mverify_infos\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcheck_duplicate_keys\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mverify_infos\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1222\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1223\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m_get_examples_iterable_for_split\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msplit_generator\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mSplitGenerator\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0mExamplesIterable\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/envs/cpu/lib/python3.6/site-packages/datasets/builder.py\u001b[0m in \u001b[0;36m_download_and_prepare\u001b[0;34m(self, dl_manager, verify_infos, **prepare_split_kwargs)\u001b[0m\n\u001b[1;32m    769\u001b[0m         \u001b[0msplit_dict\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mSplitDict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdataset_name\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    770\u001b[0m         \u001b[0msplit_generators_kwargs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_make_split_generators_kwargs\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mprepare_split_kwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 771\u001b[0;31m         \u001b[0msplit_generators\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_split_generators\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdl_manager\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0msplit_generators_kwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    772\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    773\u001b[0m         \u001b[0;31m# Checksums verification\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/.cache/huggingface/modules/datasets_modules/datasets/seamew--ChnSentiCorp/1f242195a37831906957a11a2985a4329167e60657c07dc95ebe266c03fdfb85/ChnSentiCorp.py\u001b[0m in \u001b[0;36m_split_generators\u001b[0;34m(self, dl_manager)\u001b[0m\n\u001b[1;32m     27\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     28\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m_split_generators\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdl_manager\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 29\u001b[0;31m         \u001b[0mtrain_path\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdl_manager\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdownload_and_extract\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_TRAIN_DOWNLOAD_URL\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     30\u001b[0m         \u001b[0mdev_path\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdl_manager\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdownload_and_extract\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_DEV_DOWNLOAD_URL\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     31\u001b[0m         \u001b[0mtest_path\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdl_manager\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdownload_and_extract\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_TEST_DOWNLOAD_URL\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/envs/cpu/lib/python3.6/site-packages/datasets/download/download_manager.py\u001b[0m in \u001b[0;36mdownload_and_extract\u001b[0;34m(self, url_or_urls)\u001b[0m\n\u001b[1;32m    429\u001b[0m             \u001b[0mextracted_path\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m`\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;31m`\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mextracted\u001b[0m \u001b[0mpaths\u001b[0m \u001b[0mof\u001b[0m \u001b[0mgiven\u001b[0m \u001b[0mURL\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    430\u001b[0m         \"\"\"\n\u001b[0;32m--> 431\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mextract\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdownload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0murl_or_urls\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    432\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    433\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0mget_recorded_sizes_checksums\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/envs/cpu/lib/python3.6/site-packages/datasets/download/download_manager.py\u001b[0m in \u001b[0;36mdownload\u001b[0;34m(self, url_or_urls)\u001b[0m\n\u001b[1;32m    313\u001b[0m             \u001b[0mnum_proc\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdownload_config\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnum_proc\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    314\u001b[0m             \u001b[0mdisable_tqdm\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mnot\u001b[0m \u001b[0mis_progress_bar_enabled\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 315\u001b[0;31m             \u001b[0mdesc\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"Downloading data files\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    316\u001b[0m         )\n\u001b[1;32m    317\u001b[0m         \u001b[0mduration\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdatetime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnow\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mstart_time\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/envs/cpu/lib/python3.6/site-packages/datasets/utils/py_utils.py\u001b[0m in \u001b[0;36mmap_nested\u001b[0;34m(function, data_struct, dict_only, map_list, map_tuple, map_numpy, num_proc, types, disable_tqdm, desc)\u001b[0m\n\u001b[1;32m    346\u001b[0m     \u001b[0;31m# Singleton\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    347\u001b[0m     \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata_struct\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdict\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata_struct\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtypes\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 348\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0mfunction\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata_struct\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    349\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    350\u001b[0m     \u001b[0mdisable_tqdm\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdisable_tqdm\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mlogging\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_progress_bar_enabled\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/envs/cpu/lib/python3.6/site-packages/datasets/download/download_manager.py\u001b[0m in \u001b[0;36m_download\u001b[0;34m(self, url_or_filename, download_config)\u001b[0m\n\u001b[1;32m    333\u001b[0m             \u001b[0;31m# append the relative path to the base_path\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    334\u001b[0m             \u001b[0murl_or_filename\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0murl_or_path_join\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_base_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl_or_filename\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 335\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0mcached_path\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0murl_or_filename\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdownload_config\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdownload_config\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    336\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    337\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0miter_archive\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpath_or_buf\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mUnion\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mio\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mBufferedReader\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/envs/cpu/lib/python3.6/site-packages/datasets/utils/file_utils.py\u001b[0m in \u001b[0;36mcached_path\u001b[0;34m(url_or_filename, download_config, **download_kwargs)\u001b[0m\n\u001b[1;32m    195\u001b[0m             \u001b[0muse_auth_token\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdownload_config\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0muse_auth_token\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    196\u001b[0m             \u001b[0mignore_url_params\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdownload_config\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mignore_url_params\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 197\u001b[0;31m             \u001b[0mdownload_desc\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdownload_config\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdownload_desc\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    198\u001b[0m         )\n\u001b[1;32m    199\u001b[0m     \u001b[0;32melif\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexists\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0murl_or_filename\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/envs/cpu/lib/python3.6/site-packages/datasets/utils/file_utils.py\u001b[0m in \u001b[0;36mget_from_cache\u001b[0;34m(url, cache_dir, force_download, proxies, etag_timeout, resume_download, user_agent, local_files_only, use_etag, max_retries, use_auth_token, ignore_url_params, download_desc)\u001b[0m\n\u001b[1;32m    531\u001b[0m         \u001b[0m_raise_if_offline_mode_is_enabled\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"Tried to reach {url}\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    532\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mhead_error\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 533\u001b[0;31m             \u001b[0;32mraise\u001b[0m \u001b[0mConnectionError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"Couldn't reach {url} ({repr(head_error)})\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    534\u001b[0m         \u001b[0;32melif\u001b[0m \u001b[0mresponse\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    535\u001b[0m             \u001b[0;32mraise\u001b[0m \u001b[0mConnectionError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"Couldn't reach {url} (error {response.status_code})\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mConnectionError\u001b[0m: Couldn't reach https://drive.google.com/u/0/uc?id=1uV-aDQoMI51A27OxVgJnzxqZFQqkDydZ&export=download (ConnectionError(MaxRetryError(\"HTTPSConnectionPool(host='drive.google.com', port=443): Max retries exceeded with url: /u/0/uc?id=1uV-aDQoMI51A27OxVgJnzxqZFQqkDydZ&export=download (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f95b7a09cc0>: Failed to establish a new connection: [Errno 101] Network is unreachable',))\",),))"
     ]
    }
   ],
   "source": [
    "#第3章/导出为csv格式\n",
    "dataset = load_dataset(path='seamew/ChnSentiCorp', split='train')\n",
    "dataset.to_csv(path_or_buf='./data/ChnSentiCorp.csv')\n",
    "\n",
    "#加载csv格式数据\n",
    "csv_dataset = load_dataset(path='csv',\n",
    "                           data_files='./data/ChnSentiCorp.csv',\n",
    "                           split='train')\n",
    "csv_dataset[20]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0f10145f",
   "metadata": {},
   "outputs": [],
   "source": [
    "#第3章/导出为json格式\n",
    "dataset = load_dataset(path='seamew/ChnSentiCorp', split='train')\n",
    "dataset.to_json(path_or_buf='./data/ChnSentiCorp.json')\n",
    "\n",
    "#加载json格式数据\n",
    "json_dataset = load_dataset(path='json',\n",
    "                            data_files='./data/ChnSentiCorp.json',\n",
    "                            split='train')\n",
    "json_dataset[20]"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
